#R libraries

library(knitr, warn.conflicts=F, quietly=T)
library(yaml, warn.conflicts=F, quietly=T)
library(dplyr, warn.conflicts=F, quietly=T)
library(ggplot2, warn.conflicts=F, quietly=T)
library(tidyr, warn.conflicts=F, quietly=T)
require(graphics, warn.conflicts=F, quietly=T)
library(tidyverse, warn.conflicts=F, quietly=T)
## -- Attaching packages --------------------------------------------- tidyverse 1.3.0 --
## v tibble  3.0.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## v purrr   0.3.4
## -- Conflicts ------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(magrittr, warn.conflicts=F, quietly=T)
library(GGally, warn.conflicts=F, quietly=T)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(ggthemes, warn.conflicts=F, quietly=T)
library(plotly, warn.conflicts=F, quietly=T)
library(maps, warn.conflicts=F, quietly=T)
library(stringr, warn.conflicts=F, quietly=T)
library(stringi, warn.conflicts=F, quietly=T)
library(mapproj, warn.conflicts=F, quietly=T)
library(RCurl, warn.conflicts=F, quietly=T)
library(readr, warn.conflicts=F, quietly=T)
library(rio, warn.conflicts=F, quietly=T)
library(naniar, warn.conflicts=F, quietly=T)
library(scales, warn.conflicts=F, quietly=T)
library(grid, warn.conflicts=F, quietly=T)
library(mice, warn.conflicts=F, quietly=T)
library(class, warn.conflicts=F, quietly=T)
library(caret, warn.conflicts=F, quietly=T)
library(e1071, warn.conflicts=F, quietly=T)
library(datasets, warn.conflicts=F, quietly=T)
#install.packages("investr")
library(investr, warn.conflicts=F, quietly=T)
library(fpp, warn.conflicts=F, quietly=T)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## 
## Attaching package: 'fma'
## The following object is masked from 'package:maps':
## 
##     ozone
## The following object is masked from 'package:GGally':
## 
##     pigs
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'lmtest'
## The following object is masked from 'package:RCurl':
## 
##     reset
library(fpp2, warn.conflicts=F, quietly=T)
library(shiny, warn.conflicts=F, quietly=T)

#Employee data input from CSV files

# This reads in the Employee data from select folder file CaseStudy2-data.csv.
Fulldata <- read.csv("https://raw.githubusercontent.com/VenkataVanga/MSDS-6306-CaseStudy02-Attrition/main/CaseStudy2-data.csv",
                     header = T,sep = ",",na.strings = "NA",fill = TRUE)
head(Fulldata)
##   ID Age Attrition    BusinessTravel DailyRate             Department
## 1  1  32        No     Travel_Rarely       117                  Sales
## 2  2  40        No     Travel_Rarely      1308 Research & Development
## 3  3  35        No Travel_Frequently       200 Research & Development
## 4  4  32        No     Travel_Rarely       801                  Sales
## 5  5  24        No Travel_Frequently       567 Research & Development
## 6  6  27        No Travel_Frequently       294 Research & Development
##   DistanceFromHome Education   EducationField EmployeeCount EmployeeNumber
## 1               13         4    Life Sciences             1            859
## 2               14         3          Medical             1           1128
## 3               18         2    Life Sciences             1           1412
## 4                1         4        Marketing             1           2016
## 5                2         1 Technical Degree             1           1646
## 6               10         2    Life Sciences             1            733
##   EnvironmentSatisfaction Gender HourlyRate JobInvolvement JobLevel
## 1                       2   Male         73              3        2
## 2                       3   Male         44              2        5
## 3                       3   Male         60              3        3
## 4                       3 Female         48              3        3
## 5                       1 Female         32              3        1
## 6                       4   Male         32              3        3
##                  JobRole JobSatisfaction MaritalStatus MonthlyIncome
## 1        Sales Executive               4      Divorced          4403
## 2      Research Director               3        Single         19626
## 3 Manufacturing Director               4        Single          9362
## 4        Sales Executive               4       Married         10422
## 5     Research Scientist               4        Single          3760
## 6 Manufacturing Director               1      Divorced          8793
##   MonthlyRate NumCompaniesWorked Over18 OverTime PercentSalaryHike
## 1        9250                  2      Y       No                11
## 2       17544                  1      Y       No                14
## 3       19944                  2      Y       No                11
## 4       24032                  1      Y       No                19
## 5       17218                  1      Y      Yes                13
## 6        4809                  1      Y       No                21
##   PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel
## 1                 3                        3            80                1
## 2                 3                        1            80                0
## 3                 3                        3            80                0
## 4                 3                        3            80                2
## 5                 3                        3            80                0
## 6                 4                        3            80                2
##   TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany
## 1                 8                     3               2              5
## 2                21                     2               4             20
## 3                10                     2               3              2
## 4                14                     3               3             14
## 5                 6                     2               3              6
## 6                 9                     4               2              9
##   YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
## 1                  2                       0                    3
## 2                  7                       4                    9
## 3                  2                       2                    2
## 4                 10                       5                    7
## 5                  3                       1                    3
## 6                  7                       1                    7

All the variables of given data are shown above.

# Read pagination function from github
source("https://raw.githubusercontent.com/VenkataVanga/MSDS6371-Project-Final/main/Facetwrap_Pagination.R")

#While loop below will plot for all the data in 5 Pages.
#Variables are sorted in alphabetical order.
i <- 1
while (i<5) {
  pData <- ggplot(gather(Fulldata, cols, value), aes(x = value)) + 
       geom_histogram(stat = 'count') +
    ggtitle(paste("Attrition data Visualization",i)) + 
    facet_wrap_paginate(.~cols, ncol = 3, nrow = 3, page = i)
  i = i+1
  print(pData) 
}
## Warning: Ignoring unknown parameters: binwidth, bins, pad

## Warning: Ignoring unknown parameters: binwidth, bins, pad

## Warning: Ignoring unknown parameters: binwidth, bins, pad

## Warning: Ignoring unknown parameters: binwidth, bins, pad

#Analysis of factors effecting Attrition.

Below are different factors that are considered to have significant effect on attrition.

#Factor#1 – Gender

# Attrition data from full data
Dat_Attrition <- Fulldata %>% filter(Attrition == "Yes")

# Non-Attrition data from full data
Dat_NonAttrition <- Fulldata %>% filter(Attrition == "No")

#summary of Gender - Full data
S_GenderFulldata <- Fulldata %>% group_by(Gender)%>% summarize(count = n()) %>%
 mutate(pct = count/sum(count))
## `summarise()` ungrouping output (override with `.groups` argument)
#summary of Gender - Non Attrition
S_GenderNonAttrition <- Dat_NonAttrition %>% group_by(Gender)%>% 
  summarize(count = n()) %>% mutate(pct = count/sum(count))
## `summarise()` ungrouping output (override with `.groups` argument)
#summary of Gender - Attrition
S_GenderAttrition <- Dat_Attrition %>% group_by(Gender)%>% 
  summarize(count = n()) %>% mutate(pct = count/sum(count))
## `summarise()` ungrouping output (override with `.groups` argument)
#Full data gender plot with numbers and percentage
Full_Gen <- ggplot(Fulldata, aes(x = Gender,fill = Gender)) + 
  geom_bar(show.legend = FALSE) + 
  ggtitle("Full Data Female / Male count") +
  geom_text(aes(Gender, count+10, label = count), 
            data = S_GenderFulldata) + 
  geom_text(aes(Gender, count/2, 
                label = percent(pct)), 
            data = S_GenderFulldata) + geom_text(aes(Gender[1], max(count)+50, 
                label = paste("Total Full Data Count = ",sum(count))),nudge_x = 0.5, 
            data = S_GenderFulldata) +
  theme(plot.background = element_rect(colour = "black",size = 1))

#Non-attrition data gender plot with numbers and percentage
NonAttrition_Gen <- ggplot(Dat_NonAttrition, aes(x = Gender,fill = Gender)) + 
  geom_bar(show.legend = FALSE) + 
  ggtitle("Non-Attrition Data Female / Male count") +
  geom_text(aes(Gender, count+10, label = count), 
            data = S_GenderNonAttrition) + 
  geom_text(aes(Gender, count/2, 
                label = percent(pct)), 
            data = S_GenderNonAttrition) + geom_text(aes(Gender[1], max(count)+50, 
                label = paste("Total Non-Attrition Data Count = ",sum(count))),
                nudge_x = 0.5, data = S_GenderNonAttrition) + 
  theme(plot.background = element_rect(colour = "black",size = 1))

#Attrition data gender plot with numbers and percentage
Attrition_Gen <- ggplot(Dat_Attrition, aes(x = Gender,fill = Gender)) + 
  geom_bar(show.legend = FALSE) + 
  ggtitle("Attrition Data Female / Male count") +
  geom_text(aes(Gender, count+3, label = count), 
            data = S_GenderAttrition) + 
  geom_text(aes(Gender, count/2, 
                label = percent(pct)), 
            data = S_GenderAttrition)+ geom_text(aes(Gender[1], max(count)+20, 
                label = paste("Total Attrition Data Count = ",sum(count))),
                nudge_x = 0.5, data = S_GenderAttrition) +
  theme(plot.background = element_rect(colour = "black",size = 1))

#Attribute - Gender significance with Attrition

#Grid.draw is used to plot all gender plots at once
grid.draw(cbind(ggplotGrob(Full_Gen),
                ggplotGrob(NonAttrition_Gen),ggplotGrob(Attrition_Gen)))

From the above plots it is clearly evident that the percentage of males [62%] in attrition has increased than the full data [59%] and non-attrition data[59%. There no difference in gender percentages between full data and non-attrition data. This indicates that there is more chance of attrition of male employees as compared to female employees.

#Factor#2 – Age

StatsAge <- Dat_Attrition %>% summarize(Mean = mean(Age),Median = median(Age), 
                                     Max = max(Age), Min = min(Age),
                        SD = sd(Age), N = n())
#Histogram and Density Plot
His_Den <- Dat_Attrition %>% ggplot(aes(x=Age)) +
  geom_histogram(aes(y=..density..),colour='black',fill='white', binwidth = 1) +
  geom_density(alpha=.4, fill='#FFFF00') + 
  ggtitle('Attrition Age - Histogram, Density and Box Plots') + labs(y="Density / Count") + scale_x_continuous(breaks = c(20,25,30,35,40,45,50,55,60))

His_DenAge <- His_Den +  scale_y_continuous("Density", 
                                    sec.axis = sec_axis(~ . *sum(ggplot_build(His_Den)$data[[1]]$count), name = "Count"))

#Box plot for Age
Box <- Dat_Attrition %>% ggplot(aes(x=Age)) + 
  geom_boxplot(col='black',fill='#FFFF00') + scale_x_continuous(breaks = c(20,25,30,35,40,45,50,55,60))

#Male Age - attrition

StatsMaleAge <- Dat_Attrition %>% filter(Gender == "Male") %>%
  summarize(Mean = mean(Age),Median = median(Age),Max = max(Age), 
          Min = min(Age),SD = sd(Age), N = n())
#Histogram and Density Plot Male
His_DenMale <- Dat_Attrition %>% filter(Gender == "Male") %>% ggplot(aes(x=Age)) +
  geom_histogram(aes(y=..density..),colour='red',fill='blue', binwidth = 1) +
  geom_density(alpha=.4, fill='#FFFF00') + 
  ggtitle('Attrition Age (Male) - Histogram, Density and Box Plots') + labs(y="Density / Count") + scale_x_continuous(breaks = c(20,25,30,35,40,45,50,55,60))

His_DenMale1 <- His_DenMale +  scale_y_continuous("Density", 
                                    sec.axis = sec_axis(~ . *sum(ggplot_build(His_DenMale)$data[[1]]$count), name = "Count"))

#Box plot for Age Male
BoxMale <- Dat_Attrition %>% filter(Gender == "Male") %>% ggplot(aes(x=Age)) + 
  geom_boxplot(col='black',fill='#FF6666') + scale_x_continuous(breaks = c(20,25,30,35,40,45,50,55,60))


#Female Age - attrition

StatsFemaleAge <- Dat_Attrition %>% filter(Gender == "Female") %>% 
  summarize(Mean = mean(Age),Median = median(Age),Max = max(Age), 
          Min = min(Age),SD = sd(Age), N = n())

#Histogram and Density Plot Female
His_DenFemale <- Dat_Attrition %>% filter(Gender == "Female") %>% ggplot(aes(x=Age)) +
  geom_histogram(aes(y=..density..),colour='blue',fill='#FF6666', binwidth = 1) +
  geom_density(alpha=.4, fill='#FFFF00') + 
  ggtitle('Attrition Age (Female) - Histogram, Density and Box Plots') + 
  scale_x_continuous(breaks = c(20,25,30,35,40,45,50,55,60))

His_DenFemale1 <- His_DenFemale +  scale_y_continuous("Density", 
                                    sec.axis = sec_axis(~ . *sum(ggplot_build(His_DenFemale)$data[[1]]$count), name = "Count"))

#Box plot for Age Female
BoxFemale <- Dat_Attrition %>% filter(Gender == "Female") %>% ggplot(aes(x=Age)) + 
  geom_boxplot(col='black',fill='blue') + scale_x_continuous(breaks = c(20,25,30,35,40,45,50,55,60))

#Attribute - Age significance with Attrition

#Full data Histogram density plot and Box plot on same scale
grid.draw(rbind(ggplotGrob(His_DenAge),
                ggplotGrob(Box),
                size = "first"))

StatsAge
##       Mean Median Max Min       SD   N
## 1 33.78571     32  58  18 9.614726 140
#Male data Histogram density plot and Box plot on same scale
grid.draw(rbind(ggplotGrob(His_DenMale1),
                ggplotGrob(BoxMale),
                size = "first"))

StatsMaleAge
##       Mean Median Max Min       SD  N
## 1 33.75862     32  58  18 9.400601 87
#Female data Histogram density plot and Box plot on same scale
grid.draw(rbind(ggplotGrob(His_DenFemale1),
                ggplotGrob(BoxFemale),
                size = "first"))

StatsFemaleAge
##       Mean Median Max Min       SD  N
## 1 33.83019     32  58  18 10.04746 53

Considering age as one of the attribute for attrition, the age between 27 years to 34 years are the significant years for both Male and Female where major attrition takes place.

#Factor#3 – JobLevel

#summary of JobLevel - Full data
S_JobFulldata <- Fulldata %>% group_by(JobLevel)%>% summarize(count = n()) %>%
  mutate(pct = count/sum(count))
## `summarise()` ungrouping output (override with `.groups` argument)
#summary of JobLevel - Non Attrition
S_JobNonAttrition <- Dat_NonAttrition %>% group_by(JobLevel)%>% 
  summarize(count = n()) %>% mutate(pct = count/sum(count))
## `summarise()` ungrouping output (override with `.groups` argument)
#summary of JobLevel - Attrition
S_JobAttrition <- Dat_Attrition %>% group_by(JobLevel)%>% 
  summarize(count = n()) %>% mutate(pct = count/sum(count))
## `summarise()` ungrouping output (override with `.groups` argument)
FullJob <- ggplot(Fulldata, aes(x = JobLevel, group = JobLevel)) + 
  geom_bar(aes(fill=as.factor(JobLevel)),show.legend = FALSE) + 
  ggtitle("Full Data JobLevel") +
  geom_text(aes(JobLevel, count+10, label = count), 
            data = S_JobFulldata) + 
  geom_text(aes(JobLevel, count/2, 
                label = percent(pct)), 
            data = S_JobFulldata) + geom_text(aes(JobLevel[3], max(count)+50, 
                label = paste("Total Job Level (Full Data) Count = ",sum(count))),
                data = S_JobFulldata) +
  theme(plot.background = element_rect(colour = "black",size = 1))

NonAttritionJob <- ggplot(Dat_NonAttrition, aes(x = JobLevel, group = JobLevel)) + 
  geom_bar(aes(fill=as.factor(JobLevel)),show.legend = FALSE) + 
  ggtitle("Non-Attrition Data JobLevel") +
  geom_text(aes(JobLevel, count+10, label = count), 
            data = S_JobNonAttrition) + 
  geom_text(aes(JobLevel, count/2, 
                label = percent(pct)), 
            data = S_JobNonAttrition) + geom_text(aes(JobLevel[3], max(count)+20, 
                label = paste("Total Job Level (Non-Attrition Data) Count = ",
                              sum(count))), data = S_JobFulldata) +
  theme(plot.background = element_rect(colour = "black",size = 1))

AttritionJob <- ggplot(Dat_Attrition, aes(x = JobLevel, group = JobLevel)) + 
  geom_bar(aes(fill=as.factor(JobLevel)),show.legend = FALSE) + 
  ggtitle("Attrition Data JobLevel") +
  geom_text(aes(JobLevel, count+2, label = count), 
            data = S_JobAttrition) + 
  geom_text(aes(JobLevel, count/2, 
                label = percent(pct)), 
            data = S_JobAttrition) + geom_text(aes(JobLevel[3], max(count)+20, 
                label = paste("Total Job Level (Attrition Data) Count = ",
                              sum(count))), data = S_JobAttrition) +
  theme(plot.background = element_rect(colour = "black",size = 1))

#Attribute - JobLevel significance with Attrition

grid.draw(cbind(ggplotGrob(FullJob),
                ggplotGrob(NonAttritionJob),ggplotGrob(AttritionJob)))

From above plots it can be visualized that there are more employees working joblevel 1 and joblevel 2 in the given data. Full data [37.8% + 35.9%=73.7%] and non-attrition data [33.3% + 38.6%=71.9%] have approximately same joblevel 1 and joblevel 2 combined percentages. But, the attrition data shows significantly higher attrition percentage for joblevel 1 [61.43%] and a combined joblevel 1 & 2 attrition percentage of 82.86% (61.43% + 21.43%). This indicates a approximate 9% - 10% combined joblevel 1 & 2 attrition percentage increase, which signifies major attrition is occurring in joblevel 1 followed by joblevel 2.

#Factor#4 – JobInvolvement

#summary of JobInvolvement - Full data
S_InvolveFulldata <- Fulldata %>% group_by(JobInvolvement)%>% 
  summarize(count = n()) %>%
  mutate(pct = count/sum(count))
## `summarise()` ungrouping output (override with `.groups` argument)
#summary of JobInvolvement - Non Attrition
S_InvolveNonAttrition <- Dat_NonAttrition %>% group_by(JobInvolvement)%>% 
  summarize(count = n()) %>% mutate(pct = count/sum(count))
## `summarise()` ungrouping output (override with `.groups` argument)
#summary of JobInvolvement - Attrition
S_InvolveAttrition <- Dat_Attrition %>% group_by(JobInvolvement)%>% 
  summarize(count = n()) %>% mutate(pct = count/sum(count))
## `summarise()` ungrouping output (override with `.groups` argument)
Full_Involve <- ggplot(Fulldata, aes(x = JobInvolvement, group = JobInvolvement)) + 
  geom_bar(aes(fill=as.factor(JobInvolvement)),show.legend = FALSE) + 
  ggtitle("Full Data JobInvolvement") +
  geom_text(aes(JobInvolvement, count+10, label = count), 
            data = S_InvolveFulldata) + 
  geom_text(aes(JobInvolvement, count/2, 
                label = percent(pct)), 
            data = S_InvolveFulldata) + 
  geom_text(aes(JobInvolvement[2], max(count)+50,
                label = paste("Total Job Involvement (Full Data) Count = ",
                              sum(count))),
                nudge_x = 0.5, data = S_InvolveFulldata) +
  theme(plot.background = element_rect(colour = "black",size = 1))

NonAttrition_Involve <- ggplot(Dat_NonAttrition, aes(x = JobInvolvement, 
                                                group = JobInvolvement)) + 
  geom_bar(aes(fill=as.factor(JobInvolvement)),show.legend = FALSE) + 
  ggtitle("Non-Attrition Data JobInvolvement") +
  geom_text(aes(JobInvolvement, count+10, label = count), 
            data = S_InvolveNonAttrition) + 
  geom_text(aes(JobInvolvement, count/2, 
                label = percent(pct)), 
            data = S_InvolveNonAttrition) + 
  geom_text(aes(JobInvolvement[2], max(count)+50,
                label = paste("Total Job Involvement (Non-Attrition Data) Count = ",
                              sum(count))),
                nudge_x = 0.5, data = S_InvolveNonAttrition) +
  theme(plot.background = element_rect(colour = "black",size = 1))

Attrition_Involve <- ggplot(Dat_Attrition, 
                       aes(x = JobInvolvement, group = JobInvolvement)) + 
  geom_bar(aes(fill=as.factor(JobInvolvement)),show.legend = FALSE) + 
  ggtitle("Attrition Data JobInvolvement") +
  geom_text(aes(JobInvolvement, count+2, label = count), 
            data = S_InvolveAttrition) + 
  geom_text(aes(JobInvolvement, count/2, 
                label = percent(pct)), 
            data = S_InvolveAttrition) + 
  geom_text(aes(JobInvolvement[2], max(count)+20,
                label = paste("Total Job Involvement (Attrition Data) Count = ",
                              sum(count))),
                nudge_x = 0.5, data = S_InvolveAttrition) +
  theme(plot.background = element_rect(colour = "black",size = 1))

#Attribute - JobInvolvement significance with Attrition

grid.draw(cbind(ggplotGrob(Full_Involve),
                ggplotGrob(NonAttrition_Involve),ggplotGrob(Attrition_Involve)))

From above plots it can be observed that there are more employees with job involvement 3 [59.1%] in the given data. Below are few details discussing the significance in numbers: Full data - job involvement 1 and 2 category has [5.4% + 26.2% = 31.6%]. NonAttrition data - job involvement 1 and 2 category has [3.4% + 25.2% = 28.6%]. Attrition data - job involvement 1 and 2 category has [16% + 31% = 47%]. This indicates there is approximately 15.4% - 18.4% increase in attrition for job involvement 1 and 2 categories. This signifies major attrition in job involvement category 1 category followed by job involvement category 2.

Monthly Income is the significant attribute for he employers so adding monthly income to the list of factors to visualize for attrition.

p <- ggplot(Fulldata, aes(x=TotalWorkingYears, y=MonthlyIncome)) + 
  geom_point(aes(shape = Attrition, color = Attrition), size=2.5) + 
    ggtitle('MonthlyIncome v. Total Working years - Full Data') + 
    theme(# AXIS LABLES APPEARANCE
  plot.title = element_text(size=14, face= "bold", colour= "black" ),
  axis.title.x = element_text(size=12, face="bold", colour = "black"),    
  axis.title.y = element_text(size=12, face="bold", colour = "black"),    
  axis.text.x = element_text(size=12, face="bold", colour = "black"), 
  axis.text.y = element_text(size=12, face="bold", colour = "black"),
  strip.text.x = element_text(size = 10, face="bold", colour = "black" ),
  strip.text.y = element_text(size = 10, face="bold", colour = "black"),) + 
  scale_x_continuous(name = "Experience - Total Working years") + 
  scale_y_continuous(name = "Monthly Income ($)") + 
  geom_smooth(method = "lm", color="black", size = 1.25)

#from https://stackoverflow.com/questions/7549694/add-regression-line-equation-and-r2-on-graph
lm_eqn <- function(Fulldata){
  m <- lm(MonthlyIncome~TotalWorkingYears,data = Fulldata);
  eq <- substitute(italic("MonthlyIncome") == a %*% + b %*% italic("TotalWorkingYears")*","~~italic(R)^2~"="~r2, 
                   list(a = format(unname(coef(m)[1]), digits = 5),
                        b = format(unname(coef(m)[2]), digits = 5),
                        r2 = format(summary(m)$r.squared, digits = 3)))
  as.character(as.expression(eq));
}

#Plot added with regression equation
p1 <- p + geom_text(x = 10, y = 20000, label = lm_eqn(Fulldata), parse = TRUE)

p1
## `geom_smooth()` using formula 'y ~ x'

The above plot shows Full data Monthly Income vs. Total working year.

#Montly Income non attrition

pNon <- ggplot(Dat_NonAttrition, aes(x=TotalWorkingYears, y=MonthlyIncome)) + 
  geom_point(aes(shape = Attrition, color = Attrition), size=2.5) + 
    ggtitle('MonthlyIncome v. Total Working years - Non Attrition') + 
    theme(# AXIS LABLES APPEARANCE
  plot.title = element_text(size=14, face= "bold", colour= "black" ),
  axis.title.x = element_text(size=12, face="bold", colour = "black"),    
  axis.title.y = element_text(size=12, face="bold", colour = "black"),    
  axis.text.x = element_text(size=12, face="bold", colour = "black"), 
  axis.text.y = element_text(size=12, face="bold", colour = "black"),
  strip.text.x = element_text(size = 10, face="bold", colour = "black" ),
  strip.text.y = element_text(size = 10, face="bold", colour = "black"),) + 
  scale_x_continuous(name = "Experience - Total Working years") + 
  scale_y_continuous(name = "Monthly Income ($)") + 
  geom_smooth(method = "lm", color="black", size = 1.25)

#from https://stackoverflow.com/questions/7549694/add-regression-line-equation-and-r2-on-graph
lm_eqn <- function(Dat_NonAttrition){
  m <- lm(MonthlyIncome~TotalWorkingYears,data = Dat_NonAttrition);
  eq <- substitute(italic("MonthlyIncome") == a %*% + b %*% italic("TotalWorkingYears")*","~~italic(R)^2~"="~r2, 
                   list(a = format(unname(coef(m)[1]), digits = 5),
                        b = format(unname(coef(m)[2]), digits = 5),
                        r2 = format(summary(m)$r.squared, digits = 3)))
  as.character(as.expression(eq));
}

#Plot added with regression equation
p1Non <- pNon + geom_text(x = 10, y = 20000, label = lm_eqn(Dat_NonAttrition), 
                    parse = TRUE)

p1Non
## `geom_smooth()` using formula 'y ~ x'

The above plot shows Non-Attrition data Monthly Income vs. Total working year.

#Montly Income attrition

pAttr <- ggplot(Dat_Attrition, aes(x=TotalWorkingYears, y=MonthlyIncome)) + 
  geom_point(shape = 24, fill = "#30D5C8", aes(color = "Attrition"), size=2.5) + 
    ggtitle('MonthlyIncome v. Total Working years - Attrition') + 
  guides(color=guide_legend(title="Attrition")) + 
    theme(# AXIS LABLES APPEARANCE
  plot.title = element_text(size=14, face= "bold", colour= "black" ),
  axis.title.x = element_text(size=12, face="bold", colour = "black"),    
  axis.title.y = element_text(size=12, face="bold", colour = "black"),    
  axis.text.x = element_text(size=12, face="bold", colour = "black"), 
  axis.text.y = element_text(size=12, face="bold", colour = "black"),
  strip.text.x = element_text(size = 10, face="bold", colour = "black" ),
  strip.text.y = element_text(size = 10, face="bold", colour = "black"),) + 
  scale_x_continuous(name = "Experience - Total Working years") + 
  scale_y_continuous(name = "Monthly Income ($)") + 
  geom_smooth(method = "lm", color="black", size = 1.25)

#from https://stackoverflow.com/questions/7549694/add-regression-line-equation-and-r2-on-graph
lm_eqn <- function(Dat_Attrition){
  m <- lm(MonthlyIncome~TotalWorkingYears,data = Dat_Attrition);
  eq <- substitute(italic("MonthlyIncome") == a %*% + b %*% italic("TotalWorkingYears")*","~~italic(R)^2~"="~r2, 
                   list(a = format(unname(coef(m)[1]), digits = 5),
                        b = format(unname(coef(m)[2]), digits = 5),
                        r2 = format(summary(m)$r.squared, digits = 3)))
  as.character(as.expression(eq));
}

#Plot added with regression equation
p1Attr <- pAttr + geom_text(x = 10, y = 15000, label = lm_eqn(Dat_Attrition), 
                    parse = TRUE)

p1Attr
## `geom_smooth()` using formula 'y ~ x'

The above plot shows Attrition data Monthly Income vs. Total working year. Based on the equations shown on above plots it is observed that the slope for the attrition data is low as compared to the non-attrition data which suggest that the increase in Monthly Income is lower than the increase in Monthly Income for non-attrition group - per every unit increase in total working years.

#kNN approach to test the data sensitivity and specificity based on QOI

# kNN approach for 500 dataset iterations and 1-30 k values
iterations = 500 # number of iterations to test the k value
numks = 30      # number of k used in the iterations

# split percentage assumed from the full data set based on attrition/Fulldata
splitPerc = nrow(Dat_Attrition)/nrow(Dat_NonAttrition) #approx 19%



#Since the data is unbalanced (more non attrition than attrition), tried to create more balanced data using different split percentage of indices for non-attrition and attrition data.
#https://rpubs.com/amarnathbose/knn-R - conclusion used for determining training and test sets values.
trainIndices = sample(1:dim(Dat_NonAttrition)[1],
                     round(0.65*splitPerc * dim(Dat_NonAttrition)[1]))
trainIndices1 = sample(1:dim(Dat_Attrition)[1],
                     round(0.65 * dim(Dat_Attrition)[1]))
Attrition_train = rbind(Dat_NonAttrition[trainIndices,],
                        Dat_Attrition[trainIndices1,])
Attrition_test = rbind(Dat_NonAttrition[-trainIndices,],
                        Dat_Attrition[-trainIndices1,])
# write.csv(Attrition_train,file = "Attrition_train.csv")
# write.csv(Attrition_test,file = "Attrition_test.csv")
#Replacing Gender with Male as 0 Female as 1
Attrition_train$Gender <- ifelse(Attrition_train$Gender == "Male",0,1)
Attrition_test$Gender <- ifelse(Attrition_test$Gender  == "Male",0,1)

set.seed(5)
#columns that are considered significant for Attrition as discussed above are selected for kNN. An additional column of Monthly Income is added to the kNN attributes as this would be the attribute of interest for any client looking for attrition. 
classifications = knn(Attrition_train[,c("JobLevel","JobInvolvement","Age",
                                         "Gender","MonthlyIncome")],
Attrition_test[,c("JobLevel","JobInvolvement","Age","Gender","MonthlyIncome")],
                      Attrition_train$Attrition, prob = TRUE, k = 13)
CM = confusionMatrix(table(classifications,Attrition_test$Attrition))
#classifications
#CM
#Above Confusion Matrix and Statistics show a accuracy of ~60%



#Doing iterations to check best k value
masterAcc = matrix(nrow = iterations, ncol = numks)
masterSensi = matrix(nrow = iterations, ncol = numks)
masterSpeci = matrix(nrow = iterations, ncol = numks)
for(j in 1:iterations)
{
  set.seed(5)
  accs = data.frame(accuracy = numeric(30), k = numeric(30))
  sensi = data.frame(sensitivity  = numeric(30), k = numeric(30))
  speci = data.frame(specificity  = numeric(30), k = numeric(30))
  for(i in 1:numks)
  {
classifications = knn(Attrition_train[,c("JobLevel","JobInvolvement","Age",
                                         "Gender","MonthlyIncome")], Attrition_test[,c("JobLevel","JobInvolvement","Age","Gender","MonthlyIncome")],
                      Attrition_train$Attrition, prob = TRUE, k = i)
    table(classifications,Attrition_test$Attrition)
    CM = confusionMatrix(table(classifications,Attrition_test$Attrition))
    masterAcc[j,i] = CM$overall[1]
    masterSensi[j,i] = CM$byClass[1]
    masterSpeci[j,i] = CM$byClass[2]
  }
}
kNN_MeanAcc = colMeans(masterAcc)
kNN_MeanSensi = colMeans(masterSensi)
kNN_MeanSpeci = colMeans(masterSpeci)
dfmeans <- data.frame(x = seq(1,numks,1),kNN_MeanAcc,kNN_MeanSensi,kNN_MeanSpeci)
p = ggplot(mapping = aes(x = seq(1,numks,1), y = kNN_MeanAcc,color="MeanAcc")) + geom_line() + ggtitle("Mean Accuracy, Mean Sensitivity & Mean Specificity  v. Number of k") +
  xlab('k values') + ylab(paste("Mean Accuracy, Mean Sensitivity\n Mean Specificity")) + geom_line(aes(y = kNN_MeanSensi,color="MeanSensi")) + geom_line(aes(y = kNN_MeanSpeci,color="MeanSpeci"))
#p

ggplotly(p)

The above plot shows the iterative analysis for k values vs. mean accuracy, mean sensitivity and mean specificity using kNN approach.

#Naive Bayes approach to test the data sensitivity and specificity based on QOI

# Selecting the splitperc and train indices and appoach for naive bayes is same as the kNN approach discussed above.
iterations = 500
nbmasterAcc = matrix(nrow = iterations)
nbmasterSensi = matrix(nrow = iterations)
nbmasterSpeci = matrix(nrow = iterations)
# split percentage assumed from the full data set based on attrition/Fulldata
splitPerc = nrow(Dat_Attrition)/nrow(Dat_NonAttrition) #approx 19%
nbtrainIndices = sample(1:dim(Dat_NonAttrition)[1],
                        round(0.8*splitPerc * dim(Dat_NonAttrition)[1]))
nbtrainIndices1 = sample(1:dim(Dat_Attrition)[1],
                     round(0.6 * dim(Dat_Attrition)[1]))
Attrition_trainnb = rbind(Dat_NonAttrition[nbtrainIndices,],
                        Dat_Attrition[nbtrainIndices1,])
Attrition_testnb = rbind(Dat_NonAttrition[-nbtrainIndices,],
                        Dat_Attrition[-nbtrainIndices1,])
# write.csv(Attrition_train,file = "Attrition_train.csv")
# write.csv(Attrition_test,file = "Attrition_test.csv")
#Replacing Gender with Male as 0 Female as 1
Attrition_trainnb$Gender <- ifelse(Attrition_trainnb$Gender == "Male",0,1)
Attrition_testnb$Gender <- ifelse(Attrition_testnb$Gender  == "Male",0,1)


for(j in 1:iterations)
{
  set.seed(10)
  #training
  NB_Attrition = naiveBayes(Attrition_trainnb[ ,c("JobLevel","JobInvolvement",
                                                 "Age",
                                                 "Gender","MonthlyIncome")],
                            as.factor(Attrition_trainnb$Attrition),laplace = 0)
  #Prediction
  table(predict(NB_Attrition,Attrition_testnb[,c("JobLevel","JobInvolvement",
                                            "Age","Gender",
                                            "MonthlyIncome")]),
        as.factor(Attrition_testnb$Attrition))
  CMNB = confusionMatrix(table(predict(NB_Attrition,
                                Attrition_testnb[,c("JobLevel","JobInvolvement",
                                                         "Age","Gender",
                                                         "MonthlyIncome")]),
                             as.factor(Attrition_testnb$Attrition)))
  nbmasterAcc[j] = CMNB$overall[1]
  nbmasterSensi[j] = CMNB$byClass[1]
  nbmasterSpeci[j] = CMNB$byClass[2]
}
NB_MeanAcc = colMeans(nbmasterAcc)
NB_MeanSensi = colMeans(nbmasterSensi)
NB_MeanSpeci = colMeans(nbmasterSpeci)
dfmeans_NB <- data.frame(NB_MeanAcc,NB_MeanSensi,NB_MeanSpeci)
dfmeans_NB
##   NB_MeanAcc NB_MeanSensi NB_MeanSpeci
## 1  0.7255193    0.7475728    0.4821429

Above output shows that a mean accuracy, mean specificity and mean sensitivity for Naive Bayes.

#Regression QOI#2

i <- 1
while (i<6) {
preg <- Fulldata %>% select(-ID, -BusinessTravel, -DistanceFromHome, 
                            - EmployeeCount, -EmployeeNumber) %>%
  gather(-MonthlyIncome, key = "var", value = "value") %>%
  ggplot(aes(x = value, y = MonthlyIncome)) +
  geom_point() + ggtitle(paste("Monthly Income v/s. attributes, Page",i)) + 
  scale_y_continuous(name = "Montly Income ($)") +
  theme_bw() + facet_wrap_paginate(~var, ncol = 3, nrow = 2, page = i) 
  i = i+1
  print(preg) 
}

From above Page 1 to Page 5 plots shows all the plots of Monthly income vs. major attributes. Based on the plots 1. Age, 2. Daily rate, 3. Education Field, 4. Education, 5. Hourly Rate, 6. Job Level, 7. Job Role, 8. Monthly Rate, 9. Percent Salary Hike, 10. Total working Years, 11 years at company are selected for further regression analysis.

#Intial regression

Ireg <- lm(MonthlyIncome~TotalWorkingYears + YearsAtCompany + 
             PercentSalaryHike + MonthlyRate + JobRole + JobLevel + 
             HourlyRate + Education + EducationField + DailyRate +
             Age, data = Fulldata)
Ireg
## 
## Call:
## lm(formula = MonthlyIncome ~ TotalWorkingYears + YearsAtCompany + 
##     PercentSalaryHike + MonthlyRate + JobRole + JobLevel + HourlyRate + 
##     Education + EducationField + DailyRate + Age, data = Fulldata)
## 
## Coefficients:
##                    (Intercept)               TotalWorkingYears  
##                     -8.849e-01                       5.237e+01  
##                 YearsAtCompany               PercentSalaryHike  
##                     -6.071e+00                      -1.529e+00  
##                    MonthlyRate          JobRoleHuman Resources  
##                     -9.006e-03                      -3.041e+02  
##   JobRoleLaboratory Technician                  JobRoleManager  
##                     -5.946e+02                       4.042e+03  
##  JobRoleManufacturing Director        JobRoleResearch Director  
##                      1.369e+02                       4.028e+03  
##      JobRoleResearch Scientist          JobRoleSales Executive  
##                     -3.252e+02                      -5.329e+01  
##    JobRoleSales Representative                        JobLevel  
##                     -4.264e+02                       2.792e+03  
##                     HourlyRate                       Education  
##                     -4.625e-01                      -3.170e+01  
##    EducationFieldLife Sciences         EducationFieldMarketing  
##                      1.182e+02                       4.665e+01  
##          EducationFieldMedical             EducationFieldOther  
##                      1.649e+01                       6.627e+01  
## EducationFieldTechnical Degree                       DailyRate  
##                      7.838e+01                       1.428e-01  
##                            Age  
##                     -5.341e-01
summary(Ireg)
## 
## Call:
## lm(formula = MonthlyIncome ~ TotalWorkingYears + YearsAtCompany + 
##     PercentSalaryHike + MonthlyRate + JobRole + JobLevel + HourlyRate + 
##     Education + EducationField + DailyRate + Age, data = Fulldata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3935.9  -640.7   -19.9   611.4  4109.9 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -8.849e-01  4.909e+02  -0.002 0.998562    
## TotalWorkingYears               5.237e+01  1.038e+01   5.045 5.56e-07 ***
## YearsAtCompany                 -6.071e+00  8.104e+00  -0.749 0.454021    
## PercentSalaryHike              -1.529e+00  9.949e+00  -0.154 0.877926    
## MonthlyRate                    -9.006e-03  5.142e-03  -1.751 0.080241 .  
## JobRoleHuman Resources         -3.041e+02  2.886e+02  -1.054 0.292331    
## JobRoleLaboratory Technician   -5.946e+02  1.703e+02  -3.491 0.000507 ***
## JobRoleManager                  4.042e+03  2.327e+02  17.372  < 2e-16 ***
## JobRoleManufacturing Director   1.369e+02  1.679e+02   0.815 0.415192    
## JobRoleResearch Director        4.028e+03  2.179e+02  18.486  < 2e-16 ***
## JobRoleResearch Scientist      -3.252e+02  1.702e+02  -1.910 0.056436 .  
## JobRoleSales Executive         -5.329e+01  1.541e+02  -0.346 0.729527    
## JobRoleSales Representative    -4.264e+02  2.176e+02  -1.959 0.050397 .  
## JobLevel                        2.792e+03  8.304e+01  33.619  < 2e-16 ***
## HourlyRate                     -4.625e-01  1.813e+00  -0.255 0.798764    
## Education                      -3.170e+01  3.688e+01  -0.860 0.390234    
## EducationFieldLife Sciences     1.182e+02  3.371e+02   0.351 0.725891    
## EducationFieldMarketing         4.665e+01  3.567e+02   0.131 0.895977    
## EducationFieldMedical           1.649e+01  3.381e+02   0.049 0.961125    
## EducationFieldOther             6.627e+01  3.641e+02   0.182 0.855624    
## EducationFieldTechnical Degree  7.838e+01  3.538e+02   0.222 0.824754    
## DailyRate                       1.428e-01  9.093e-02   1.571 0.116663    
## Age                            -5.341e-01  5.613e+00  -0.095 0.924225    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1065 on 847 degrees of freedom
## Multiple R-squared:  0.9477, Adjusted R-squared:  0.9464 
## F-statistic: 697.8 on 22 and 847 DF,  p-value: < 2.2e-16
CV(Ireg)
##           CV          AIC         AICc          BIC        AdjR2 
## 1.161656e+06 1.215356e+04 1.215498e+04 1.226800e+04 9.463564e-01
anova(Ireg)
## Analysis of Variance Table
## 
## Response: MonthlyIncome
##                    Df     Sum Sq    Mean Sq   F value  Pr(>F)    
## TotalWorkingYears   1 1.1133e+10 1.1133e+10 9818.1924 < 2e-16 ***
## YearsAtCompany      1 4.5003e+05 4.5003e+05    0.3969 0.52888    
## PercentSalaryHike   1 2.8221e+05 2.8221e+05    0.2489 0.61800    
## MonthlyRate         1 6.8217e+06 6.8217e+06    6.0158 0.01438 *  
## JobRole             8 4.9529e+09 6.1911e+08  545.9736 < 2e-16 ***
## JobLevel            1 1.3098e+09 1.3098e+09 1155.0391 < 2e-16 ***
## HourlyRate          1 2.3683e+04 2.3683e+04    0.0209 0.88513    
## Education           1 9.3283e+05 9.3283e+05    0.8226 0.36467    
## EducationField      5 1.7131e+06 3.4262e+05    0.3021 0.91168    
## DailyRate           1 2.7915e+06 2.7915e+06    2.4617 0.11702    
## Age                 1 1.0265e+04 1.0265e+04    0.0091 0.92422    
## Residuals         847 9.6047e+08 1.1340e+06                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

From ANOVA output it can be seen that Total working years, Monthly Rate, Job Role, Job Level are the factors that are statistically significant (p<0.05). Using the above factors rerunning the regression analysis.

#Rerun regression

Rreg <- lm(MonthlyIncome~TotalWorkingYears + MonthlyRate + JobRole + 
             JobLevel, data = Fulldata)
Rreg
## 
## Call:
## lm(formula = MonthlyIncome ~ TotalWorkingYears + MonthlyRate + 
##     JobRole + JobLevel, data = Fulldata)
## 
## Coefficients:
##                   (Intercept)              TotalWorkingYears  
##                     2.352e+01                      4.803e+01  
##                   MonthlyRate         JobRoleHuman Resources  
##                    -8.916e-03                     -3.303e+02  
##  JobRoleLaboratory Technician                 JobRoleManager  
##                    -5.988e+02                      4.005e+03  
## JobRoleManufacturing Director       JobRoleResearch Director  
##                     1.414e+02                      4.013e+03  
##     JobRoleResearch Scientist         JobRoleSales Executive  
##                    -3.307e+02                     -6.786e+01  
##   JobRoleSales Representative                       JobLevel  
##                    -4.140e+02                      2.796e+03
summary(Rreg)
## 
## Call:
## lm(formula = MonthlyIncome ~ TotalWorkingYears + MonthlyRate + 
##     JobRole + JobLevel, data = Fulldata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3950.9  -635.3   -33.5   646.1  4226.5 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    2.352e+01  2.156e+02   0.109 0.913148    
## TotalWorkingYears              4.803e+01  7.941e+00   6.049 2.18e-09 ***
## MonthlyRate                   -8.916e-03  5.093e-03  -1.751 0.080366 .  
## JobRoleHuman Resources        -3.303e+02  2.515e+02  -1.313 0.189369    
## JobRoleLaboratory Technician  -5.988e+02  1.688e+02  -3.548 0.000409 ***
## JobRoleManager                 4.005e+03  2.284e+02  17.534  < 2e-16 ***
## JobRoleManufacturing Director  1.414e+02  1.670e+02   0.847 0.397464    
## JobRoleResearch Director       4.013e+03  2.144e+02  18.720  < 2e-16 ***
## JobRoleResearch Scientist     -3.307e+02  1.692e+02  -1.954 0.051013 .  
## JobRoleSales Executive        -6.786e+01  1.441e+02  -0.471 0.637696    
## JobRoleSales Representative   -4.140e+02  2.114e+02  -1.958 0.050516 .  
## JobLevel                       2.796e+03  8.169e+01  34.228  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1061 on 858 degrees of freedom
## Multiple R-squared:  0.9474, Adjusted R-squared:  0.9467 
## F-statistic:  1404 on 11 and 858 DF,  p-value: < 2.2e-16
CV(Rreg)
##           CV          AIC         AICc          BIC        AdjR2 
## 1.140337e+06 1.213723e+04 1.213765e+04 1.219922e+04 9.466978e-01
anova(Rreg)
## Analysis of Variance Table
## 
## Response: MonthlyIncome
##                    Df     Sum Sq    Mean Sq   F value Pr(>F)    
## TotalWorkingYears   1 1.1133e+10 1.1133e+10 9881.0866 <2e-16 ***
## MonthlyRate         1 7.0273e+06 7.0273e+06    6.2369 0.0127 *  
## JobRole             8 4.9424e+09 6.1780e+08  548.3031 <2e-16 ***
## JobLevel            1 1.3200e+09 1.3200e+09 1171.5314 <2e-16 ***
## Residuals         858 9.6675e+08 1.1267e+06                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The ANOVA output data for the rerun regression looks good. Continuing with the rerun regression model and visualizing the scatter plot.

#Regression Considering JobRole as a numeric variable.
Rregdata <- Fulldata %>% mutate(JobRole = as.numeric(as.factor(JobRole)))
Rreg1 <- lm(MonthlyIncome~TotalWorkingYears + MonthlyRate + JobRole + 
             JobLevel, data = Rregdata)
Rreg1
## 
## Call:
## lm(formula = MonthlyIncome ~ TotalWorkingYears + MonthlyRate + 
##     JobRole + JobLevel, data = Rregdata)
## 
## Coefficients:
##       (Intercept)  TotalWorkingYears        MonthlyRate            JobRole  
##        -1.770e+03          5.594e+01         -4.044e-03          4.427e+00  
##          JobLevel  
##         3.715e+03
summary(Rreg1)
## 
## Call:
## lm(formula = MonthlyIncome ~ TotalWorkingYears + MonthlyRate + 
##     JobRole + JobLevel, data = Rregdata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5439.3  -889.5    58.8   721.9  3919.9 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -1.770e+03  1.762e+02 -10.048  < 2e-16 ***
## TotalWorkingYears  5.594e+01  1.012e+01   5.525 4.35e-08 ***
## MonthlyRate       -4.044e-03  6.657e-03  -0.608    0.544    
## JobRole            4.427e+00  1.937e+01   0.228    0.819    
## JobLevel           3.715e+03  6.943e+01  53.512  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1391 on 865 degrees of freedom
## Multiple R-squared:  0.9089, Adjusted R-squared:  0.9085 
## F-statistic:  2157 on 4 and 865 DF,  p-value: < 2.2e-16
CV(Rreg1)
##           CV          AIC         AICc          BIC        AdjR2 
## 1.946733e+06 1.260067e+04 1.260077e+04 1.262928e+04 9.084729e-01
anova(Rreg1)
## Analysis of Variance Table
## 
## Response: MonthlyIncome
##                    Df     Sum Sq    Mean Sq   F value  Pr(>F)    
## TotalWorkingYears   1 1.1133e+10 1.1133e+10 5754.3969 < 2e-16 ***
## MonthlyRate         1 7.0273e+06 7.0273e+06    3.6321 0.05701 .  
## JobRole             1 1.5289e+07 1.5289e+07    7.9020 0.00505 ** 
## JobLevel            1 5.5403e+09 5.5403e+09 2863.5248 < 2e-16 ***
## Residuals         865 1.6736e+09 1.9348e+06                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#Rerun regression scatter plot

pRreg <- Fulldata %>% select(MonthlyIncome,TotalWorkingYears,MonthlyRate,
                             JobRole,JobLevel) %>%
  gather(-MonthlyIncome, key = "var", value = "value") %>%
  ggplot(aes(x=value, y=MonthlyIncome)) +
  geom_point() + 
  ggtitle(paste("Monthly Income v/s. Statistically significant attributes")) + 
  scale_y_continuous(name = "Montly Income ($)") +
  theme_bw() + facet_wrap(.~var, scales = "free") 
#pRreg
#Above plot is only for verification to sort all variables for a good plot
pRreg1 <- Fulldata %>% select(MonthlyIncome,TotalWorkingYears,MonthlyRate,
                             JobRole,JobLevel) %>%
  ggplot(aes(x=TotalWorkingYears, y=MonthlyIncome)) +
  geom_point() + 
  ggtitle(paste("Monthly Income v/s. Total Working Years")) + 
  scale_y_continuous(name = "Montly Income ($)") + 
  scale_x_continuous(name = "Total Working Years") + 
  theme_bw() + geom_smooth(method = "lm")

pRreg2 <- Fulldata %>% select(MonthlyIncome,TotalWorkingYears,MonthlyRate,
                             JobRole,JobLevel) %>%
  mutate(bin=cut_width(MonthlyRate, width=0.5, boundary=0)) %>%
  ggplot(aes(x = bin, y = MonthlyIncome)) +
  geom_boxplot() +
  ggtitle(paste("Monthly Income v/s. Monthly Rate")) + 
  scale_y_continuous(name = "Montly Income ($)") + 
    scale_x_discrete(name = "Monthly Rate ($)")+
    theme_bw()

pRreg3 <- Fulldata %>% select(MonthlyIncome,TotalWorkingYears,MonthlyRate,
                             JobRole,JobLevel) %>% 
  mutate(JobRole = fct_reorder(JobRole, MonthlyIncome, .fun='median' )) %>% 
  ggplot(aes(x = fct_reorder(JobRole, MonthlyIncome), y = MonthlyIncome)) +
  geom_point() + 
  ggtitle(paste("Monthly Income v/s. Job Role")) + 
  scale_y_continuous(name = "Montly Income ($)") + 
  scale_x_discrete(name = "Job Role") +
  theme_bw() + 
  theme(axis.text.x=element_text(angle=25,hjust=1,size=10))

pRreg4 <- Fulldata %>% select(MonthlyIncome,TotalWorkingYears,MonthlyRate,
                             JobRole,JobLevel) %>% 
  mutate(JobLevel = reorder(JobLevel, MonthlyIncome, .fun='median' )) %>% 
  ggplot(aes(x = reorder(JobLevel, MonthlyIncome), y = MonthlyIncome)) +
  geom_point() + 
  ggtitle(paste("Monthly Income v/s. Job Level")) + 
  scale_y_continuous(name = "Montly Income ($)") + 
  scale_x_discrete(name = "Job Level") +
  theme_bw() + geom_smooth(method = "lm")
#Grid.draw is used to plot all gender plots at once
grid.draw(cbind(ggplotGrob(pRreg1),
                ggplotGrob(pRreg2)))
## `geom_smooth()` using formula 'y ~ x'

grid.draw(cbind(ggplotGrob(pRreg3),
                ggplotGrob(pRreg4)))
## `geom_smooth()` using formula 'y ~ x'

Above plots show the visualization for Monthly Income v/s. all the statistically significant explanatory variables for Monthly Income.

Income_predict = predict(Rreg)
RMSE_Income = sqrt(mean((Fulldata$MonthlyIncome-Income_predict)^2))
RMSE_Income
## [1] 1054.136

RMSE for income is < $3000; QOI#2 satisfied.

CaseStudy - 02 YouTube Link: https://youtu.be/xjNT9R3IuH4